Get Data

# Load the bookings data, standardize names/types, keep only completed stays,
# and drop columns made useless by that filter.
a <- read_csv('hotel_bookings.csv') %>%
  clean_names() %>%
  # encode all character columns as factors
  mutate(across(where(is.character), factor)) %>%
  # order columns alphabetically...
  select(sort(tidyselect::peek_vars())) %>%
  # ...then group them by type: dates, factors, numerics
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>%
  # keep non-canceled bookings only
  filter(is_canceled == 0) %>%
  # is_canceled is now constant; reservation_status_date is not needed downstream
  select(-is_canceled, -reservation_status_date)

sample data

# Preview 10 random rows. slice_sample() supersedes sample_n() in dplyr >= 1.0.
a %>% slice_sample(n = 10)

check for missing values

# Per-variable missingness summary (naniar::miss_var_summary).
a %>% miss_var_summary()

For a thorough EDA, check out my other publication, 'Predicting Hotel Bookings & Prices'.

examining outcome var in more detail

# Distribution of the outcome (adr = average daily rate): histogram, then boxplot.
a %>% select(adr) %>% plot_ly(x = ~adr) %>% add_histogram()
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# (The `##` lines above are captured console output from a knitted run, not code.)
a %>% select(adr) %>% plot_ly(x = ~adr) %>% add_boxplot()

It is odd that there are average daily rates below $10 and above $400 -- let's exclude them.

# Removing outliers at BOTH ends: keep rates strictly between $10 and $400.
a <- a %>% filter(adr > 10, adr < 400)

filter to 2 datasets: resort hotel, and city hotel

# Split into one dataset per hotel type.
# NOTE(review): `r` and `c` shadow base names (e.g. base::c); function calls
# like c(1, 2) still resolve correctly, but clearer names would be safer.
r <- a %>% filter(hotel == 'Resort Hotel')
c <- a %>% filter(hotel == 'City Hotel')

removing [now] useless cols

# `hotel` is constant within each subset after the split -- drop it.
r$hotel <- NULL
c$hotel <- NULL

City Hotels: Tidy Split -> Prep -> Process

1) Split Data

set.seed(345) # reproducible subsample + split
##!!<NOTE> limiting data for speed purposes (random 50% of the city data)
c.split <- initial_split(c %>% slice_sample(prop = 0.50))
c.train <- rsample::training(c.split)
c.test <- rsample::testing(c.split)
# 5-fold CV resamples on the training set, used for hyperparameter tuning
c.vfolds <- vfold_cv(c.train, v = 5)

2) Create preprocessing recipe

# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14

#a recipe is used for preprocessing
#a recipe is used for preprocessing; outcome = adr, everything else a predictor
c.recipe = c.train %>% recipe(adr ~ . ) %>%
  #----------------------------
  #assemble a single arrival date from the year / month-name / day columns
  step_mutate(
   arrival.date = lubridate::make_date(
     arrival_date_year,
     match(arrival_date_month, month.name),
     arrival_date_day_of_month
     )
   ) %>%
  #enets can only use numeric data, so convert arrival.date to numeric
  #(strips the dashes: e.g. 2017-08-01 -> 20170801)
  step_mutate(
    arrival.date = arrival.date %>%
      as.character %>%
      stringr::str_replace_all('-','') %>%
      as.numeric
  ) %>%
  #arrival_date_month needs to be converted from factor > numeric (1-12)
  step_mutate(
    arrival_date_month = match(arrival_date_month, month.name)
  ) %>% 
  #----------------------------
  #remove numeric predictors HIGHLY correlated with other predictors
  #(step_corr drops redundant variables, not low-correlation ones)
  step_corr(all_numeric(),-all_outcomes()) %>% 
  #remove vars with low or no variance
  step_nzv(all_numeric(),-all_outcomes()) %>% 
  step_zv(all_numeric(),-all_outcomes()) %>%
  #----------------------------
  #reduce number of levels for factors with many, many levels
  step_other(agent, company, country) %>%  #default threshold of 0.05
  #----------------------------
  #create one-hot dummy vars for every factor predictor
  # NOTE(review): reservation_status looks near-constant after the
  # non-canceled filter upstream -- confirm it adds signal here
  step_dummy(
    agent, assigned_room_type, company,
    country, customer_type, deposit_type,
    distribution_channel, market_segment, meal,
    reservation_status, reserved_room_type,
    one_hot = TRUE
    ) %>% 
  #----------------------------
  #center/scale the two stay-length vars, then collapse them via PCA
  step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>% 
  step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) #will limit to PC1 only
  
#list the (untrained) steps of the recipe
c.recipe %>% tidy

check interaction of recipe with vars

# Train (prep) the recipe on c.train and print which steps removed what.
(c.recipe %>% prep)
# (Everything below is captured console output from a knitted run, not code.)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         28
## 
## Training data contained 16902 data points and no missing data.
## 
## Operations:
## 
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival_date_month [trained]
## Correlation filter removed 2 items [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Dummy variables from agent, assigned_room_type, company, ... [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]

3) OPTIONAL: independently process train & test

#'Using the recipe, prep & bake the train ds'
# Prep (train) the recipe ONCE and reuse it for both bakes -- the original
# called prep() twice, re-estimating the identical recipe.
c.prepped <- c.recipe %>% prep()

#'Using the prepped recipe, bake the train ds'
# new_data = NULL returns the processed training data the recipe was prepped on
c.baked.train <- c.prepped %>% bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars()))

#'Using the prepped recipe, bake the test ds'
# applies the SAME train-estimated transformations to the held-out test set
c.baked.test <- c.prepped %>% bake(new_data = c.test) %>%
  select(sort(tidyselect::peek_vars()))

c.baked.train %>% head() %>% DT::datatable()
c.baked.test %>% head() %>% DT::datatable()

City Hotels: TidyModeling

en = 'elastic net'

1) Create model Specs w/hps to be tuned

#hyperparameters with a value of 'tune()' are those we will 'tune'
c.en.mdl = parsnip::linear_reg(
  penalty = tune(), 
  mixture = tune() #lasso / ridge mix
) %>% 
  set_mode('regression') %>% 
  set_engine('glmnet')

2) Create workflow Specs

#create workflow (pipeline) combining recipe (preprocessing) and model (w/hps)
# Workflow (pipeline) combining recipe (preprocessing) and model (w/hps).
c.en.wf <- workflow() %>%
  add_recipe(c.recipe) %>%
  add_model(c.en.mdl)

3) Execute model & workflow on vfold ds using auto-hps Execution

doParallel::registerDoParallel() # parallelize tuning across resamples
set.seed(345) # make the auto-generated grid reproducible

# grid = 10 asks tune_grid() to build a 10-candidate grid AUTOMATICALLY
c.en.tg <- tune_grid(
  c.en.wf,
  resamples = c.vfolds,
  grid = 10
)

c.en.tg
c.en.tg %>% collect_metrics() # mean CV metrics per candidate

viz results

# Interactive plot: mean CV RMSE against each hyperparameter value.
ggplotly(
c.en.tg %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, penalty, mixture) %>%
  # long format: one row per (hyperparameter, value) so we can facet by hp
  pivot_longer(penalty:mixture,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "RMSE")
)
# Same RMSE results as a table, best (lowest mean) first.
c.en.tg %>%
  collect_metrics() %>% 
  filter(.metric == 'rmse') %>% 
  select(mean, penalty, mixture, .config) %>% 
  arrange(mean)

4) OPTIONAL: Manually create tuning grids

#Easiest Option
(en.grid.xpd = expand_grid(
  mixture = seq(0.40, 0.50, by = 0.03),
  penalty = seq(0.25, 0.35, by = 0.02)
  ))

en.params <- parameters(penalty(), mixture())

(en.grid.reg = grid_regular(en.params, levels = c(5, 5)))

# creates a SFD (space filling design grid), keeps param combinations as far away from each other
(en.grid.rdm = grid_max_entropy(en.params, size = 15))


en.grid.reg %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()


en.grid.rdm %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()

en.grid.xpd %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()

5) choose best hps

# Best hp combo by lowest CV RMSE; metric is a named argument in current tune.
(c.en.best.hps <- select_best(c.en.tg, metric = 'rmse'))

6) finalize workflow, fit & execute model, evaluate metrics

c.en.wf %>%
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(c.en.best.hps) %>%
  #2) fit on entire train and then execute/predict on test (via c.split)
  last_fit(c.split) %>%
  #3) evaluate metrics on the held-out predictions
  collect_predictions() %>%
  select(.pred, adr) %>%
  #'metric_set(rmse, rsq, mae)' returns a metric FUNCTION, invoked in-line here
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)

Resort Hotels: Tidy Split -> Prep -> Process

1) Split Data

set.seed(345) # reproducible subsample + split
##!!<NOTE> limiting data for speed purposes (random 50% of the resort data)
r.split <- initial_split(r %>% slice_sample(prop = 0.50))
r.train <- rsample::training(r.split)
r.test <- rsample::testing(r.split)
# 5-fold CV resamples on the training set, used for hyperparameter tuning
r.vfolds <- vfold_cv(r.train, v = 5)

2) Create preprocessing recipe

# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14

#a recipe is used for preprocessing
#a recipe is used for preprocessing; outcome = adr, everything else a predictor
r.recipe = r.train %>% recipe(adr ~ . ) %>%
  #----------------------------
  #assemble a single arrival date from the year / month-name / day columns
  step_mutate(
   arrival.date = lubridate::make_date(
     arrival_date_year,
     match(arrival_date_month, month.name),
     arrival_date_day_of_month
     )
   ) %>%
  #enets can only use numeric data, so convert arrival.date to numeric
  #(strips the dashes: e.g. 2017-08-01 -> 20170801)
  step_mutate(
    arrival.date = arrival.date %>%
      as.character %>%
      stringr::str_replace_all('-','') %>%
      as.numeric
  ) %>%
  #arrival_date_month needs to be converted from factor > numeric (1-12)
  step_mutate(
    arrival_date_month = match(arrival_date_month, month.name)
  ) %>% 
  #----------------------------
  #remove numeric predictors HIGHLY correlated with other predictors
  #(step_corr drops redundant variables, not low-correlation ones)
  step_corr(all_numeric(),-all_outcomes()) %>% 
  #remove vars with low or no variance
  step_nzv(all_numeric(),-all_outcomes()) %>% 
  step_zv(all_numeric(),-all_outcomes()) %>%
  #----------------------------
  #reduce number of levels for factors with many, many levels
  step_other(agent, company, country) %>%  #default threshold of 0.05
  #----------------------------
  #create one-hot dummy vars for every factor predictor
  # NOTE(review): reservation_status looks near-constant after the
  # non-canceled filter upstream -- confirm it adds signal here
  step_dummy(
    agent, assigned_room_type, company,
    country, customer_type, deposit_type,
    distribution_channel, market_segment, meal,
    reservation_status, reserved_room_type,
    one_hot = TRUE
    ) %>% 
  #----------------------------
  #center/scale the two stay-length vars, then collapse them via PCA
  step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>% 
  step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) #will limit to PC1 only
  
#list the (untrained) steps of the recipe
r.recipe %>% tidy

check interaction of recipe with vars

# Train (prep) the recipe on r.train and print which steps removed what.
(r.recipe %>% prep)
# (Everything below is captured console output from a knitted run, not code.)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         28
## 
## Training data contained 10571 data points and no missing data.
## 
## Operations:
## 
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival_date_month [trained]
## Correlation filter removed 2 items [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Dummy variables from agent, assigned_room_type, company, ... [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]

3) OPTIONAL: independently process train & test

#'Using the recipe, prep & bake the train ds'
# Prep (train) the recipe ONCE and reuse it for both bakes -- the original
# called prep() twice, re-estimating the identical recipe.
r.prepped <- r.recipe %>% prep()

#'Using the prepped recipe, bake the train ds'
# new_data = NULL returns the processed training data the recipe was prepped on
r.baked.train <- r.prepped %>% bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars()))

#'Using the prepped recipe, bake the test ds'
# applies the SAME train-estimated transformations to the held-out test set
r.baked.test <- r.prepped %>% bake(new_data = r.test) %>%
  select(sort(tidyselect::peek_vars()))

r.baked.train %>% head() %>% DT::datatable()
r.baked.test %>% head() %>% DT::datatable()

Resort Hotels: TidyModeling

en = 'elastic net'

1) Create model Specs w/hps to be tuned

#hyperparameters with a value of 'tune()' are those we will 'tune'
# Elastic-net linear regression via glmnet; hyperparameters with a value of
# tune() are placeholders to be filled in by tuning below.
r.en.mdl <- parsnip::linear_reg(
  penalty = tune(), # regularization amount (lambda)
  mixture = tune()  # lasso / ridge mix (alpha)
) %>%
  set_mode('regression') %>%
  set_engine('glmnet')

2) Create workflow Specs

#create workflow (pipeline) combining recipe (preprocessing) and model (w/hps)
# Workflow (pipeline) combining recipe (preprocessing) and model (w/hps).
r.en.wf <- workflow() %>%
  add_recipe(r.recipe) %>%
  add_model(r.en.mdl)

3) Execute model & workflow on vfold ds using auto-hps Execution

doParallel::registerDoParallel() # parallelize tuning across resamples
set.seed(345) # make the auto-generated grid reproducible

# grid = 10 asks tune_grid() to build a 10-candidate grid AUTOMATICALLY
r.en.tg <- tune_grid(
  r.en.wf,
  resamples = r.vfolds,
  grid = 10
)

r.en.tg
r.en.tg %>% collect_metrics() # mean CV metrics per candidate

viz results

# Interactive plot: mean CV RMSE against each hyperparameter value.
ggplotly(
r.en.tg %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, penalty, mixture) %>%
  # long format: one row per (hyperparameter, value) so we can facet by hp
  pivot_longer(penalty:mixture,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "RMSE")
)
# Same RMSE results as a table, best (lowest mean) first.
r.en.tg %>%
  collect_metrics() %>% 
  filter(.metric == 'rmse') %>% 
  select(mean, penalty, mixture, .config) %>% 
  arrange(mean)

4) OPTIONAL: Manually create tuning grids

#Easiest Option
(en.grid.xpd = expand_grid(
  mixture = seq(0.40, 0.50, by = 0.03),
  penalty = seq(0.25, 0.35, by = 0.02)
  ))

en.params <- parameters(penalty(), mixture())

(en.grid.reg = grid_regular(en.params, levels = c(5, 5)))

# creates a SFD (space filling design grid), keeps param combinations as far away from each other
(en.grid.rdm = grid_max_entropy(en.params, size = 15))


en.grid.reg %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()


en.grid.rdm %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()

en.grid.xpd %>% 
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()

5) choose best hps

# Best hp combo by lowest CV RMSE; metric is a named argument in current tune.
(r.en.best.hps <- select_best(r.en.tg, metric = 'rmse'))

6) finalize workflow, fit & execute model, evaluate metrics

r.en.wf %>%
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(r.en.best.hps) %>%
  #2) fit on entire train and then execute/predict on test (via r.split)
  last_fit(r.split) %>%
  #3) evaluate metrics on the held-out predictions
  collect_predictions() %>%
  select(.pred, adr) %>%
  #'metric_set(rmse, rsq, mae)' returns a metric FUNCTION, invoked in-line here
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)